# Location of the candy data set (a CSV file in the working directory).
candy_file <- "candy.csv"

# Read the table; the first CSV column supplies the row names.
candy <- read.csv(file = candy_file, row.names = 1)
# Preview the first rows.
head(candy)

#Q1. How many different candy types are in this dataset? #Answer 85

# Every row is one candy, so the row count is the number of candy types.
nrow(candy)
## [1] 85

#Q2. How many fruity candy types are in the dataset?

# Summing a 0/1 indicator column counts the candies that carry that flag.
sum(candy$fruity)
## [1] 38
sum(candy$chocolate)
## [1] 37

#Q3. What is your favorite candy in the dataset and what is its winpercent value?

# Select one candy by its row name and read off its winpercent value.
candy["Almond Joy", "winpercent"]
## [1] 50.34755

#side-note: the skimr::skim() function

There is a useful skim() function in the skimr package that can help give you a quick overview of a given dataset. Let’s install this package and try it on our candy data.

# Attach skimr and print its per-column summary of the candy table.
library(skimr)
skim(data = candy)
Data summary
Name candy
Number of rows 85
Number of columns 12
_______________________
Column type frequency:
numeric 12
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
chocolate 0 1 0.44 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
fruity 0 1 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
caramel 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
peanutyalmondy 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
nougat 0 1 0.08 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
crispedricewafer 0 1 0.08 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
hard 0 1 0.18 0.38 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
bar 0 1 0.25 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
pluribus 0 1 0.52 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
sugarpercent 0 1 0.48 0.28 0.01 0.22 0.47 0.73 0.99 ▇▇▇▇▆
pricepercent 0 1 0.47 0.29 0.01 0.26 0.47 0.65 0.98 ▇▇▇▇▆
winpercent 0 1 50.32 14.71 22.45 39.14 47.83 59.86 84.18 ▃▇▆▅▂

#Q6. Is there any variable/column that looks to be on a different scale to the majority of the other columns in the dataset? - winpercent

#Q7. What do you think a zero and one represent for the candy$chocolate column?

#Q8. Plot a histogram of winpercent values

# Histogram of the winpercent column, showing how the win percentages are distributed.
hist(candy$winpercent)

#Q9. Is the distribution of winpercent values symmetrical?

#Q10. Is the center of the distribution above or below 50%? Below

#Q11. On average is chocolate candy higher or lower ranked than fruit candy? #as logical to return which candy is chocolate

# Turn the 0/1 chocolate flag into a TRUE/FALSE mask, one entry per candy.
as.logical(candy$chocolate)
##  [1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [25]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [37]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
## [49] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE
## [61] FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [85]  TRUE

# Use that mask to pick the winpercent of every chocolate candy.

candy$winpercent[as.logical(candy$chocolate)]
##  [1] 66.97173 67.60294 50.34755 56.91455 38.97504 55.37545 62.28448 56.49050
##  [9] 59.23612 57.21925 76.76860 71.46505 66.57458 55.06407 73.09956 60.80070
## [17] 64.35334 47.82975 54.52645 70.73564 66.47068 69.48379 81.86626 84.18029
## [25] 73.43499 72.88790 65.71629 34.72200 37.88719 76.67378 59.52925 48.98265
## [33] 43.06890 45.73675 49.65350 81.64291 49.52411

# Store the values per group and average them: chocolate first, then fruity.

chocolate <- candy$winpercent[as.logical(candy$chocolate)]
mean(chocolate)
## [1] 60.92153
fruity <- candy$winpercent[as.logical(candy$fruity)]
mean(fruity)
## [1] 44.11974

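#The order of the two arguments matters: t.test(chocolate, fruity) reports the difference as chocolate minus fruity, so swapping them flips the sign of t and of the confidence interval.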

# Two-sample t-test comparing the chocolate and fruity winpercent values.
t.test(x = chocolate, y = fruity)
## 
##  Welch Two Sample t-test
## 
## data:  chocolate and fruity
## t = 6.2582, df = 68.882, p-value = 2.871e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  11.44563 22.15795
## sample estimates:
## mean of x mean of y 
##  60.92153  44.11974

#3. Overall Candy Rankings Let’s use the base R order() function together with head() to sort the whole dataset by winpercent. Or if you have been getting into the tidyverse and the dplyr package you can use the arrange() function together with head() to do the same thing and answer the following questions:

# Bar chart of winpercent for every candy, in the table's own row order.
library(ggplot2)
ggplot(candy, aes(x = winpercent, y = rownames(candy))) +
  geom_col()

# Same chart, but with the candies sorted by winpercent through reorder().

ggplot(
  candy,
  aes(x = winpercent, y = reorder(rownames(candy), winpercent))
) +
  geom_col()

#create colors for every candy type

# One colour per candy. Each replacement overwrites the previous ones, so a
# candy matching several flags keeps the last matching colour (fruity over
# bar over chocolate); candies matching none stay black.
my_cols <- rep("black", nrow(candy))
my_cols <- replace(my_cols, as.logical(candy$chocolate), "chocolate")
my_cols <- replace(my_cols, as.logical(candy$bar), "brown")
my_cols <- replace(my_cols, as.logical(candy$fruity), "pink")

# Sorted winpercent bar chart, filled with the colours assigned above.
ggplot(
  candy,
  aes(x = winpercent, y = reorder(rownames(candy), winpercent))
) +
  geom_col(fill = my_cols)

# Scatter plot of pricepercent against winpercent with every candy labelled.
ggplot(
  candy,
  aes(x = winpercent, y = pricepercent, label = rownames(candy))
) +
  geom_point(colour = my_cols) +
  geom_text()

#4. Taking a look at pricepercent. What about value for money? What is the best candy for the least money? One way to get at this would be to make a plot of winpercent vs the pricepercent variable. The pricepercent variable records the percentile rank of the candy’s price against all the other candies in the dataset. Lower values are less expensive and higher values are more expensive.

To this plot we will add text labels so we can more easily identify a given candy. There is a regular geom_label() that comes with ggplot2. However, as there are quite a few candies in our dataset, lots of these labels will overlap and be hard to read. To help with this we can use the geom_text_repel() function from the ggrepel package.

# ggrepel supplies geom_text_repel(), which pushes overlapping labels apart.
library(ggrepel)

# Price vs. win percentage again, now with repelled, colour-matched labels.
ggplot(
  candy,
  aes(x = winpercent, y = pricepercent, label = rownames(candy))
) +
  geom_point(colour = my_cols) +
  geom_text_repel(colour = my_cols, size = 3.3, max.overlaps = 10)
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#Q19. Which candy type is the highest ranked in terms of winpercent for the least money - i.e. offers the most bang for your buck? Reese’s Miniatures

# List the candy names; they are stored as the row names of the table.
row.names(candy)
##  [1] "100 Grand"                   "3 Musketeers"               
##  [3] "One dime"                    "One quarter"                
##  [5] "Air Heads"                   "Almond Joy"                 
##  [7] "Baby Ruth"                   "Boston Baked Beans"         
##  [9] "Candy Corn"                  "Caramel Apple Pops"         
## [11] "Charleston Chew"             "Chewey Lemonhead Fruit Mix" 
## [13] "Chiclets"                    "Dots"                       
## [15] "Dum Dums"                    "Fruit Chews"                
## [17] "Fun Dip"                     "Gobstopper"                 
## [19] "Haribo Gold Bears"           "Haribo Happy Cola"          
## [21] "Haribo Sour Bears"           "Haribo Twin Snakes"         
## [23] "HersheyÕs Kisses"            "HersheyÕs Krackel"          
## [25] "HersheyÕs Milk Chocolate"    "HersheyÕs Special Dark"     
## [27] "Jawbusters"                  "Junior Mints"               
## [29] "Kit Kat"                     "Laffy Taffy"                
## [31] "Lemonhead"                   "Lifesavers big ring gummies"
## [33] "Peanut butter M&MÕs"         "M&MÕs"                      
## [35] "Mike & Ike"                  "Milk Duds"                  
## [37] "Milky Way"                   "Milky Way Midnight"         
## [39] "Milky Way Simply Caramel"    "Mounds"                     
## [41] "Mr Good Bar"                 "Nerds"                      
## [43] "Nestle Butterfinger"         "Nestle Crunch"              
## [45] "Nik L Nip"                   "Now & Later"                
## [47] "Payday"                      "Peanut M&Ms"                
## [49] "Pixie Sticks"                "Pop Rocks"                  
## [51] "Red vines"                   "ReeseÕs Miniatures"         
## [53] "ReeseÕs Peanut Butter cup"   "ReeseÕs pieces"             
## [55] "ReeseÕs stuffed with pieces" "Ring pop"                   
## [57] "Rolo"                        "Root Beer Barrels"          
## [59] "Runts"                       "Sixlets"                    
## [61] "Skittles original"           "Skittles wildberry"         
## [63] "Nestle Smarties"             "Smarties candy"             
## [65] "Snickers"                    "Snickers Crisper"           
## [67] "Sour Patch Kids"             "Sour Patch Tricksters"      
## [69] "Starburst"                   "Strawberry bon bons"        
## [71] "Sugar Babies"                "Sugar Daddy"                
## [73] "Super Bubble"                "Swedish Fish"               
## [75] "Tootsie Pop"                 "Tootsie Roll Juniors"       
## [77] "Tootsie Roll Midgies"        "Tootsie Roll Snack Bars"    
## [79] "Trolli Sour Bites"           "Twix"                       
## [81] "Twizzlers"                   "Warheads"                   
## [83] "WelchÕs Fruit Snacks"        "WertherÕs Original Caramel" 
## [85] "Whoppers"

#change Õ in the rownames to a ’. gsub = global substitute

# Replace the mis-encoded "Õ" with an apostrophe and store the result back
# into the row names. gsub() only returns a modified copy, so without the
# assignment the table (and every later plot label) keeps the "Õ".
rownames(candy) <- gsub("Õ", "'", rownames(candy))
# Print the cleaned names to confirm the substitution.
rownames(candy)
##  [1] "100 Grand"                   "3 Musketeers"               
##  [3] "One dime"                    "One quarter"                
##  [5] "Air Heads"                   "Almond Joy"                 
##  [7] "Baby Ruth"                   "Boston Baked Beans"         
##  [9] "Candy Corn"                  "Caramel Apple Pops"         
## [11] "Charleston Chew"             "Chewey Lemonhead Fruit Mix" 
## [13] "Chiclets"                    "Dots"                       
## [15] "Dum Dums"                    "Fruit Chews"                
## [17] "Fun Dip"                     "Gobstopper"                 
## [19] "Haribo Gold Bears"           "Haribo Happy Cola"          
## [21] "Haribo Sour Bears"           "Haribo Twin Snakes"         
## [23] "Hershey's Kisses"            "Hershey's Krackel"          
## [25] "Hershey's Milk Chocolate"    "Hershey's Special Dark"     
## [27] "Jawbusters"                  "Junior Mints"               
## [29] "Kit Kat"                     "Laffy Taffy"                
## [31] "Lemonhead"                   "Lifesavers big ring gummies"
## [33] "Peanut butter M&M's"         "M&M's"                      
## [35] "Mike & Ike"                  "Milk Duds"                  
## [37] "Milky Way"                   "Milky Way Midnight"         
## [39] "Milky Way Simply Caramel"    "Mounds"                     
## [41] "Mr Good Bar"                 "Nerds"                      
## [43] "Nestle Butterfinger"         "Nestle Crunch"              
## [45] "Nik L Nip"                   "Now & Later"                
## [47] "Payday"                      "Peanut M&Ms"                
## [49] "Pixie Sticks"                "Pop Rocks"                  
## [51] "Red vines"                   "Reese's Miniatures"         
## [53] "Reese's Peanut Butter cup"   "Reese's pieces"             
## [55] "Reese's stuffed with pieces" "Ring pop"                   
## [57] "Rolo"                        "Root Beer Barrels"          
## [59] "Runts"                       "Sixlets"                    
## [61] "Skittles original"           "Skittles wildberry"         
## [63] "Nestle Smarties"             "Smarties candy"             
## [65] "Snickers"                    "Snickers Crisper"           
## [67] "Sour Patch Kids"             "Sour Patch Tricksters"      
## [69] "Starburst"                   "Strawberry bon bons"        
## [71] "Sugar Babies"                "Sugar Daddy"                
## [73] "Super Bubble"                "Swedish Fish"               
## [75] "Tootsie Pop"                 "Tootsie Roll Juniors"       
## [77] "Tootsie Roll Midgies"        "Tootsie Roll Snack Bars"    
## [79] "Trolli Sour Bites"           "Twix"                       
## [81] "Twizzlers"                   "Warheads"                   
## [83] "Welch's Fruit Snacks"        "Werther's Original Caramel" 
## [85] "Whoppers"

Q20. What are the top 5 most expensive candy types in the dataset and of these which is the least popular?

# Q20 asks for the MOST expensive candies, so sort pricepercent from highest
# to lowest. Ascending order would list the five cheapest candies instead.
ord <- order(candy$pricepercent, decreasing = TRUE)
# Show price and popularity for the five priciest candies; the least popular
# of them is the row with the smallest winpercent. Columns are selected by
# name rather than by position so a changed column order cannot break this.
head(candy[ord, c("pricepercent", "winpercent")], n = 5)
# corrplot draws correlation matrices.
library(corrplot)
## corrplot 0.90 loaded

# Pairwise correlations between all columns of the candy table, then plotted.

cij <- cor(x = candy)
corrplot(corr = cij)

#PCA using prcomp. We need to scale the data, and we use summary() to inspect the result. Side-note: Feel free to examine what happens if you leave this argument out (i.e. use the default scale=FALSE). Then examine summary(pca) and the pca$rotation[,1] component and see that it is dominated by winpercent (which is, after all, measured on a very different scale than the other variables).

# PCA of the candy table. `scale. = TRUE` rescales every column to unit
# variance before the decomposition. The formal argument of prcomp() is
# spelled `scale.`; writing `scale` only worked via partial argument matching.
pca <- prcomp(candy, scale. = TRUE)
# Standard deviation and proportion of variance explained by each component.
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4    PC5     PC6     PC7
## Standard deviation     2.0788 1.1378 1.1092 1.07533 0.9518 0.81923 0.81530
## Proportion of Variance 0.3601 0.1079 0.1025 0.09636 0.0755 0.05593 0.05539
## Cumulative Proportion  0.3601 0.4680 0.5705 0.66688 0.7424 0.79830 0.85369
##                            PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.74530 0.67824 0.62349 0.43974 0.39760
## Proportion of Variance 0.04629 0.03833 0.03239 0.01611 0.01317
## Cumulative Proportion  0.89998 0.93832 0.97071 0.98683 1.00000
# Candies plotted on the first two principal components, coloured by my_cols.
plot(pca$x[, c("PC1", "PC2")], col = my_cols, pch = 16)

#We can make a nicer plot with ggplot. It needs an input data.frame that includes a separate column for each of the aesthetics you would like displayed. To accomplish this we make a new data.frame here that contains our PCA results together with all the rest of our candy data. We will then use this for making the plots below. cbind() combines the candy data.frame with PCs 1-3.

# Append the scores on the first three principal components to the candy data.
my_data <- cbind(candy, pca$x[, c("PC1", "PC2", "PC3")])
my_data

# PC1 vs PC2 scatter: point size follows winpercent, and the candy name is
# carried in both the `text` and `label` aesthetics for the later
# ggplotly() and geom_text_repel() calls.
p <- ggplot(
  my_data,
  aes(
    x = PC1,
    y = PC2,
    size = winpercent / 100,
    text = rownames(my_data),
    label = rownames(my_data)
  )
) +
  geom_point(colour = my_cols)

p

library(ggrepel)

# Add repelled candy labels to the PCA scatter, hide the size legend, and set
# the titles. The subtitle must describe the colours actually stored in
# my_cols: fruity candies are drawn in "pink", not red.
p + geom_text_repel(size = 3.3, col = my_cols, max.overlaps = 7) +
  theme(legend.position = "none") +
  labs(title = "Halloween Candy PCA Space",
       subtitle = "Colored by type: chocolate bar (dark brown), chocolate other (light brown), fruity (pink), other (black)",
       caption = "Data from 538")
## Warning: ggrepel: 39 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive version of the PCA scatter plot.
ggplotly(p)
# Enlarge the bottom margin so the rotated variable names fit under the bars.
par(mar = c(8, 4, 2, 2))
# PC1 loading of each original variable; las = 2 draws the axis labels
# perpendicular to the axis.
barplot(pca$rotation[, "PC1"], las = 2, ylab = "PC1 Contribution")

#Q24. What original variables are picked up strongly by PC1 in the positive direction? Do these make sense to you?